Modeling the Joint Distribution of Wind Speed and Direction using Gaussain Mixture Models

OEN Method: Harris, Cook The parent wind speed distribution: Why Weibull? http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from import_file import *
from helpers.parallel_helper import *
load_libs()

plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/uk/tiree/dat.txt', 1.9, 4 
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4 
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path= './data/NCDC/europe/france/pau_pyrenees/dat.txt' # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend, try 2
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9  # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit

# file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type

# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1 
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4 
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'
In [3]:
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
elif 'NDAWN' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
else:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
    knot_unit = True
In [4]:
df
Out[4]:
date speed_max speed dir HrMn type wind_type
0 20000101 10.0 7.25 265.54 0000 default default
1 20000101 11.0 7.54 282.57 0100 default default
2 20000101 9.0 5.99 300.36 0200 default default
3 20000101 13.0 7.57 316.85 0300 default default
4 20000101 11.0 7.55 326.70 0400 default default
5 20000101 10.0 5.72 333.54 0500 default default
6 20000101 13.0 7.84 359.81 0600 default default
7 20000101 17.0 7.66 9.60 0700 default default
8 20000101 15.0 9.01 10.83 0800 default default
9 20000101 15.0 8.97 22.07 0900 default default
10 20000101 17.0 11.09 43.60 1000 default default
11 20000101 15.0 8.08 59.34 1100 default default
12 20000101 16.0 9.98 64.34 1200 default default
13 20000101 13.0 8.63 73.64 1300 default default
14 20000101 14.0 9.11 73.27 1400 default default
15 20000101 16.0 9.43 76.12 1500 default default
16 20000101 15.0 8.78 68.38 1600 default default
17 20000101 14.0 9.30 63.63 1700 default default
18 20000101 15.0 9.25 68.60 1800 default default
19 20000101 17.0 9.10 68.59 1900 default default
20 20000101 15.0 9.75 61.73 2000 default default
21 20000101 15.0 9.66 65.37 2100 default default
22 20000101 12.0 7.53 48.52 2200 default default
23 20000101 12.0 7.94 45.52 2300 default default
24 20000102 9.0 6.92 31.84 0000 default default
25 20000102 13.0 8.38 38.74 0100 default default
26 20000102 12.0 9.04 41.45 0200 default default
27 20000102 13.0 8.53 46.76 0300 default default
28 20000102 13.0 8.94 49.42 0400 default default
29 20000102 17.0 10.15 47.74 0500 default default
... ... ... ... ... ... ... ...
140860 20161230 12.0 8.29 148.22 1500 default default
140861 20161230 18.0 10.55 163.86 1600 default default
140862 20161230 17.0 11.02 171.45 1700 default default
140863 20161230 18.0 11.54 176.00 1800 default default
140864 20161230 19.0 10.65 173.49 1900 default default
140865 20161230 8.0 3.75 196.33 2000 default default
140866 20161230 10.0 2.98 240.04 2100 default default
140867 20161230 16.0 8.37 271.96 2200 default default
140868 20161230 21.0 11.02 291.91 2300 default default
140869 20161231 21.0 11.02 278.03 0000 default default
140870 20161231 22.0 12.44 285.56 0100 default default
140871 20161231 22.0 12.03 290.84 0200 default default
140872 20161231 21.0 12.82 300.28 0300 default default
140873 20161231 20.0 11.11 300.06 0400 default default
140874 20161231 22.0 13.55 333.92 0500 default default
140875 20161231 21.0 12.67 327.34 0600 default default
140876 20161231 24.0 13.80 322.47 0700 default default
140877 20161231 24.0 15.98 324.02 0800 default default
140878 20161231 22.0 13.95 318.25 0900 default default
140879 20161231 15.0 9.97 313.95 1000 default default
140880 20161231 16.0 10.38 308.63 1100 default default
140881 20161231 15.0 9.10 288.69 1200 default default
140882 20161231 14.0 9.22 274.42 1300 default default
140883 20161231 12.0 7.33 263.12 1400 default default
140884 20161231 14.0 8.47 239.63 1500 default default
140885 20161231 14.0 8.32 234.34 1600 default default
140886 20161231 13.0 7.68 225.05 1700 default default
140887 20161231 13.0 5.99 205.11 1800 default default
140888 20161231 13.0 7.02 204.43 1900 default default
140889 20161231 13.0 7.69 206.57 2000 default default

140890 rows × 7 columns

In [5]:
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)
In [6]:
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) & \
              (date >= 19700000) & (date < 20170000) ")
In [7]:
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [8]:
# Dir [10,360]=> [0,350]
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
df['month'] = df['date']%10000//100
# Convert Windrose coordianates to Polar Cooridinates 
df['dir_windrose'] = df['dir']
df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
date speed_max speed dir HrMn month dir_windrose
count 1.408900e+05 140890.000000 140890.000000 140890.000000 140890.000000 140890.000000 140890.000000
mean 2.008173e+07 13.783746 7.817171 195.383235 1150.363404 6.524217 196.856811
std 4.882175e+04 6.476898 4.134382 96.638849 692.392299 3.456021 99.056700
min 2.000010e+07 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000
25% 2.004052e+07 9.000000 4.700000 123.570000 500.000000 4.000000 126.100000
50% 2.008091e+07 13.000000 7.340000 192.690000 1200.000000 7.000000 196.990000
75% 2.012111e+07 18.000000 10.450000 285.880000 1800.000000 10.000000 288.680000
max 2.016123e+07 76.000000 31.900000 359.980000 2300.000000 12.000000 359.990000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xc7a86a0>

1.3 General Data Info

1.3.1 Unit Detection

In [9]:
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))

if 'convert_to_knot' not in globals():
    convert_to_knot = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
    
if convert_to_knot:
    knot_unit = True
    df['speed'] = df['speed'] * 1.943845
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='knot')
    # need more elaboration, some is not near an integer
    df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
    
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
True
In [10]:
# df['decimal'] = df.speed % 1
# df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))
# if 'knot_unit' not in globals():
#     knot_unit = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
#     if knot_unit:
#         df['speed'] = df['speed'] * 1.943845
#         df['decimal'] = df.speed % 1
#         df.decimal.hist(alpha=0.5, label='knot')
#         # need more elaboration, some is not near an integer
#         df['speed'] = df['speed'].apply(lambda x: int(round(x)))
#     plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
    
# df.drop(['decimal'], 1,inplace=True)
# print(knot_unit)
In [11]:
dir_unit_text = ' (degree)'
if knot_unit == True:
    speed_unit_text = ' (knot)'
else: 
    speed_unit_text = ' (m/s)'

1.3.2 Sampling Type Selection

In [12]:
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Types Comprisement', figsize=(4,3))

report_type_most_used = sample_type.value_counts().argmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection

In [13]:
MID_YEAR = (min(df.date)//10000+max(df.date)//10000)//2

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df.query('date > @MID_YEAR * 10000')['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='> %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [14]:
df['sample_time'] = df.HrMn % 100 
sample_time = df.query('date > 20000000')['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0]
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0xc1c9b00>

1.4 Error Data handling and Adjustment

1.4.1 Artefacts

wrong direction record

In [15]:
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')

sudden increase in speed

In [16]:
# sudden increse
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date speed_max speed dir HrMn type wind_type month dir_windrose incre incre_reverse
time
2014-06-14 12:00:00 20140614 56.0 31.90 293.63 1200 default default 6 156.37 8.04 7.09
2001-04-07 09:00:00 20010407 45.0 29.69 253.22 900 default default 4 196.78 4.57 1.22
2000-04-05 20:00:00 20000405 51.0 29.36 165.55 2000 default default 4 284.45 3.80 0.82
2016-11-18 15:00:00 20161118 51.0 28.81 118.13 1500 default default 11 331.87 4.33 5.72
2000-04-05 21:00:00 20000405 43.0 28.54 155.62 2100 default default 4 294.38 -0.82 6.01
2001-04-07 10:00:00 20010407 43.0 28.47 246.85 1000 default default 4 203.15 -1.22 0.18
2001-04-07 11:00:00 20010407 42.0 28.29 236.30 1100 default default 4 213.70 -0.18 0.46
2001-04-07 12:00:00 20010407 40.0 27.83 228.62 1200 default default 4 221.38 -0.46 0.08
2001-04-07 13:00:00 20010407 42.0 27.75 224.03 1300 default default 4 225.97 -0.08 2.55
2010-10-26 19:00:00 20101026 52.0 27.29 209.15 1900 default default 10 240.85 0.45 0.92
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0xc9e6358>
In [17]:
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 0
date speed_max speed dir HrMn type wind_type month dir_windrose incre incre_reverse
time
2014-06-14 12:00:00 20140614 56.0 31.90 293.63 1200 default default 6 156.37 8.04 7.09
2001-04-07 09:00:00 20010407 45.0 29.69 253.22 900 default default 4 196.78 4.57 1.22
2000-04-05 20:00:00 20000405 51.0 29.36 165.55 2000 default default 4 284.45 3.80 0.82
2016-11-18 15:00:00 20161118 51.0 28.81 118.13 1500 default default 11 331.87 4.33 5.72
2000-04-05 21:00:00 20000405 43.0 28.54 155.62 2100 default default 4 294.38 -0.82 6.01
2001-04-07 10:00:00 20010407 43.0 28.47 246.85 1000 default default 4 203.15 -1.22 0.18
2001-04-07 11:00:00 20010407 42.0 28.29 236.30 1100 default default 4 213.70 -0.18 0.46
2001-04-07 12:00:00 20010407 40.0 27.83 228.62 1200 default default 4 221.38 -0.46 0.08
2001-04-07 13:00:00 20010407 42.0 27.75 224.03 1300 default default 4 225.97 -0.08 2.55
2010-10-26 19:00:00 20101026 52.0 27.29 209.15 1900 default default 10 240.85 0.45 0.92

1.4.2 Direction re-aligment

For some dataset, the 16 sectors are not record properly,

e.g. the sectors are [0,20,50 ...], need to redistribute the angle into 22.5, e.g. [0, 22.5, 45...]

In [18]:
display(df['dir'].value_counts().sort_index())
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0.00      4
0.01      1
0.02      1
0.03      1
0.05      3
0.06      1
0.07      4
0.08      1
0.09      1
0.10      1
0.11      3
0.12      1
0.13      2
0.14      1
0.15      1
0.16      1
0.17      2
0.19      1
0.20      2
0.21      5
0.22      1
0.23      1
0.24      1
0.25      6
0.26      4
0.27      2
0.28      1
0.29      3
0.31      2
0.32      3
         ..
359.66    1
359.69    3
359.70    1
359.71    3
359.72    1
359.73    1
359.74    3
359.75    2
359.76    2
359.77    1
359.78    1
359.79    3
359.80    2
359.81    4
359.82    3
359.84    4
359.85    3
359.86    4
359.87    2
359.88    4
359.89    1
359.90    3
359.91    3
359.92    1
359.93    1
359.94    2
359.95    3
359.96    3
359.97    5
359.98    1
Name: dir, dtype: int64
1 10
In [19]:
df=realign_direction(df, effective_column)

1.4.3 0 Speed

In [20]:
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df.query("(date >= 20050000)"))
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.0160740718664
In [21]:
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
Series([], Name: speed, dtype: int64)

1.5 Time Shift Comparison

In [22]:
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df.query('date < @MID_YEAR * 10000')['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df.query('date > @MID_YEAR * 10000')['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [23]:
df.query('date < @MID_YEAR * 10000')['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df.query('date > @MID_YEAR * 10000')['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [24]:
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date speed_max speed dir HrMn type wind_type month dir_windrose
time
In [25]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
2000 - 2004
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
2005 - 2009
2010 - 2014
2015 - 2016
In [26]:
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[26]:
(0, 11.0)
In [27]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 5000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
In [28]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 5000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> -45 ~ 55, to make KDE result better

In [29]:
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [30]:
if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)

1.7 Generate (x,y) from (speed,dir)

In [31]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [32]:
# There might be a small dot in the centre, which is due to too many zero (more than 1 speed) in center
# Scatter plot in matplot has performance issue, the speed is very slow
df['x'] = df['speed'] * cos(df['dir'] * pi / 180.0)
df['y'] = df['speed'] * sin(df['dir'] * pi / 180.0)

2. Re-select Data and Overview

2.1 Data Overview

In [33]:
## Summery of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? True
Report type used: default
Sampling time used: [0]
Out[33]:
date speed_max speed dir HrMn month dir_windrose x y
count 4.253800e+04 42538.000000 42538.000000 42538.000000 42538.000000 42538.000000 42538.000000 42538.000000 42538.000000
mean 2.012079e+07 13.810546 7.537785 196.145684 1150.524237 6.529433 197.253630 -0.682973 -0.058287
std 1.418683e+04 6.634189 4.159124 97.243295 691.425208 3.460651 100.082629 6.116975 6.019186
min 2.010010e+07 1.000000 0.000000 0.020000 0.000000 1.000000 0.000000 -25.695173 -29.225280
25% 2.011040e+07 9.000000 4.362500 122.592500 600.000000 4.000000 124.490000 -4.501345 -4.042897
50% 2.012071e+07 13.000000 6.960000 190.365000 1200.000000 7.000000 196.525000 -0.498976 -0.435592
75% 2.013101e+07 18.000000 10.220000 288.890000 1700.000000 10.000000 290.890000 3.389413 4.045007
max 2.014123e+07 59.000000 31.900000 359.970000 2300.000000 12.000000 359.990000 25.003446 21.919299
In [34]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x16ea29e8>
In [35]:
# Accumulation by month
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x182388d0>
In [36]:
# 90 degree is east
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [37]:
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resmapled')
    plt_configure(legend=True, figsize=(20,4))
In [38]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [39]:
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [40]:
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 10
In [41]:
%%time
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 7.76 s

2.3 Overview by Month

In [42]:
%%time
month_incre = 1
current_df = df.query('speed>=1')
for month in arange(1, 12+month_incre, month_incre): 
    end_month = month+month_incre
    sub_df = current_df.query('(month >= @month) and (month < @end_month)')
    if len(sub_df) > 0:
        title = 'Month: %s' % (month)
        ax = WindroseAxes.from_ax()
        ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
        plt_configure(figsize=(3,3), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
Wall time: 18.5 s
In [43]:
df.describe()
Out[43]:
date speed_max speed dir HrMn month dir_windrose x y
count 4.253800e+04 42538.000000 42538.000000 42538.000000 42538.000000 42538.000000 42538.000000 42538.000000 42538.000000
mean 2.012079e+07 13.810546 7.537785 196.145684 1150.524237 6.529433 197.253630 -0.682973 -0.058287
std 1.418683e+04 6.634189 4.159124 97.243295 691.425208 3.460651 100.082629 6.116975 6.019186
min 2.010010e+07 1.000000 0.000000 0.020000 0.000000 1.000000 0.000000 -25.695173 -29.225280
25% 2.011040e+07 9.000000 4.362500 122.592500 600.000000 4.000000 124.490000 -4.501345 -4.042897
50% 2.012071e+07 13.000000 6.960000 190.365000 1200.000000 7.000000 196.525000 -0.498976 -0.435592
75% 2.013101e+07 18.000000 10.220000 288.890000 1700.000000 10.000000 290.890000 3.389413 4.045007
max 2.014123e+07 59.000000 31.900000 359.970000 2300.000000 12.000000 359.990000 25.003446 21.919299

3. Create input data and configuration

In [44]:
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [45]:
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = []
for i in fitting_axis_range:
    for j in fitting_axis_range:
        FITTING_RANGE.append([i,j])
[-16 -15 -14 -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1
   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16]
In [46]:
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [47]:
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [48]:
%%time
if 'bandwidth' not in globals():
#     bandwidth = DEFAULT_BANDWDITH
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The bandwidth value sometimes would be too radical
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1,0.1)

    # Grid search is unable to deal with too many data (a long time is needed)
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)
1.1
Wall time: 0 ns
In [49]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 1.1 1089
[  1.71463588e-07   1.03813122e-06   3.70719443e-06   8.64717452e-06
   1.72495859e-05]
In [50]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()
In [51]:
kde_cdf = cdf_from_pdf(kde_result)
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}
In [52]:
%%time
gof_kde=Parallel(n_jobs=-1)(delayed(resampled_kde)(df, kde_result, config) 
                                       for i in arange(50)) 
Wall time: 37.4 s
In [53]:
for gof_name in [ 'R_square', 'K_S','Chi_square']:
    plt.figure(figsize=(4,3))
    pd.DataFrame(gof_kde)[gof_name].hist()
    plt_configure(title=gof_name)
align_figures()
In [54]:
# %%time
year_length = 5
gofs_bivariate = []
df_start_year, df_end_year = df_all_years.index.year[0], df_all_years.index.year[-1]
for start_year in arange(df_start_year, df_end_year-year_length):
    end_year = start_year+year_length-1
    df_previous = df_all_years[str(start_year):str(end_year)]
    speed_previous = array(list(zip(df_previous.x, df_previous.y)))
    kde2 = neighbors.KernelDensity(bandwidth=bandwidth, kernel=KDE_KERNEL).fit(speed_previous)
    kde_result2 = exp(kde2.score_samples(points))
    gofs_bivariate.append(goodness_of_fit_summary(kde_result2, kde_result))
gofs_bivariate=pd.DataFrame(gofs_bivariate)
gofs_bivariate.index = arange(df_start_year, df_end_year-year_length)
In [55]:
gofs_bivariate
Out[55]:
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
2000 0.063165 0.041921 1.000423e-07 0.053182 0.350144 0.926725
2001 0.055314 0.042885 9.082731e-08 0.050674 0.333629 0.933475
2002 0.055837 0.035839 7.763107e-08 0.046848 0.308442 0.943140
2003 0.042191 0.034822 5.975643e-08 0.041103 0.270612 0.956232
2004 0.030823 0.026601 3.955410e-08 0.033440 0.220166 0.971029
2005 0.020047 0.013565 1.982346e-08 0.023674 0.155864 0.985481
2006 0.011131 0.012994 1.036004e-08 0.017114 0.112677 0.992412
2007 0.007216 0.011943 6.994842e-09 0.014063 0.092586 0.994877
2008 0.006487 0.013004 7.831834e-09 0.014880 0.097969 0.994264
2009 0.003581 0.009093 5.785576e-09 0.012789 0.084203 0.995762
2010 0.000000 0.000000 0.000000e+00 0.000000 0.000000 1.000000
In [56]:
gofs_bivariate.plot(y='R_square', figsize=(4,3))
gofs_bivariate.plot(y='K_S', figsize=(4,3))
align_figures()

univariate gof standard

In [57]:
def yearly_gof(df_all_years, start_year, end_year, density, y_ecdf, x):
    df_previous = df_all_years[str(start_year):str(end_year)]
    density_expected, _ = np.histogram(df_previous['speed'], bins=x, normed=True)
    r_square = sector_r_square(density, density_expected)
    
    y_ecdf_previous = sm.distributions.ECDF(df_previous['speed'])(x)
    k_s = max(np.abs(y_ecdf - y_ecdf_previous))
    return {'year': start_year, 'r_square': r_square, 'k_s': k_s}
In [58]:
x = arange(0, df.speed.max() + 1)
fig = plt.figure()
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)

for year_length in arange(5, 11):
    df_standard = df_all_years[str(2010):str(2014)]
    density, _ = np.histogram(df_standard['speed'], bins=x, normed=True)
    y_ecdf = sm.distributions.ECDF(df_standard.speed)(x)

    gofs = [yearly_gof(df_all_years, start_year, start_year+year_length-1, density, y_ecdf, x) 
            for start_year in arange(df_start_year, df_end_year-year_length)]

    gofs = pd.DataFrame(gofs)
    if len(gofs)>0:
        ax1.plot(gofs.year, gofs.r_square, label=year_length)
        ax2.plot(gofs.year, gofs.k_s, label=year_length)
plt.legend()
Out[58]:
<matplotlib.legend.Legend at 0x17aeb828>

5. GMM by Expectation-maximization

In [59]:
sample= SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
In [60]:
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[60]:
weight mean_x mean_y sig_x sig_y corr
1 0.308 4.616 -4.945 4.506 4.674 0.224
2 0.291 -1.520 -1.473 3.413 3.403 0.108
3 0.207 -7.948 4.815 4.708 5.010 0.245
4 0.195 -0.097 4.602 4.172 4.468 -0.046
In [61]:
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.307605700658 [[ 4.61605015 -4.94514214]] [ 4.03537368  5.08565922] 139.634027935
0.290962918536 [[-1.52028891 -1.47293892]] [ 3.21861634  3.58775211] -45.8147426533
0.206550388194 [[-7.94824886  4.81506627]] [ 4.20438252  5.43906082] 142.131628929
0.194880992612 [[-0.0966457   4.60221175]] [ 4.14025744  4.49741557] -162.955391747
In [62]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = exp(clf.score_samples(points))
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [63]:
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[63]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.958 0.019 0.046 5.785883e-08 0.040 0.266
In [64]:
gmm_em = group_gmm_param_from_gmm_param_array(gmm_em_result, sort_group = True)
mixed_model_pdf_em = generate_gmm_pdf_from_grouped_gmm_param(gmm_em)

6. GMM by Optimization

In [65]:
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [66]:
# from GMM,EM 
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element, which is the fraction of each gaussian
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # # limit the width/height ratio of elliplse, optional
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

bonds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
         (0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)

result = sp.optimize.minimize(
    lambda x0: GMM_fit_score(x0, kde_result, points, FIT_METHOD),
    x0,
    bounds = bonds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result
Out[66]:
     fun: -18.16342688075353
     jac: array([  1.70488715e+00,   2.38418579e-07,  -2.38418579e-07,
         0.00000000e+00,   0.00000000e+00,   2.38418579e-07,
         1.70488858e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   2.38418579e-07,   2.38418579e-07,
         1.70488667e+00,  -2.38418579e-07,   0.00000000e+00,
         2.38418579e-07,   2.38418579e-07,   4.76837158e-07,
         1.70487952e+00,  -2.38418579e-07,   0.00000000e+00,
         2.38418579e-07,   2.38418579e-07,   0.00000000e+00,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 2368
     nit: 90
    njev: 90
  status: 0
 success: True
       x: array([ 0.18573458,  4.62368942, -6.44614001,  4.31351674,  4.40541475,
        0.28409451,  0.09335169, -1.52215872, -1.64819632,  1.98730558,
        2.5695959 ,  0.3647571 ,  0.66006817, -2.50191072,  2.45625315,
        5.98629863,  5.69371339, -0.15045077,  0.06084556,  3.36099208,
       -3.3712317 ,  1.72588479,  1.79676329, -0.2452108 ])

6.1 GMM Result

In [67]:
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[67]:
weight mean_x mean_y sig_x sig_y corr
1 0.660 -2.502 2.456 5.986 5.694 -0.150
2 0.186 4.624 -6.446 4.314 4.405 0.284
3 0.093 -1.522 -1.648 1.987 2.570 0.365
4 0.061 3.361 -3.371 1.726 1.797 -0.245
In [68]:
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.660068168468 [[-2.50191072  2.45625315]] [ 5.35927992  6.28746867] -125.785719938
0.185734577736 [[ 4.62368942 -6.44614001]] [ 3.6869511   4.94170996] 137.12204979
0.0933516903929 [[-1.52215872 -1.64819632]] [ 1.72894577  2.75008238] 152.73060665
0.0608455634032 [[ 3.36099208 -3.3712317 ]] [ 1.52739591  1.96827293] -139.661728543

6.2 Goodness-of-fit statistics

In [69]:
gof_df(gmm_pdf_result, kde_result)
Out[69]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.991 0.007 0.021 1.293373e-08 0.019 0.126
In [70]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()
In [71]:
fig = plt.figure(figsize=(4.2,2.4))
ax1 = fig.add_subplot(1,2,1) 
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='', ylabel='', colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2) 
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='', ylabel='', colorbar=False)
ax2.grid(False)
ax2.get_yaxis().set_visible(False)
In [72]:
def f(V,theta):
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V

def f_em(V,theta):
    return (mixed_model_pdf_em([[V*cos(theta),V*sin(theta)]]))*V
In [73]:
%%time
x = arange(0, max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed, x= x)
Wall time: 12.6 s
In [74]:
%%time
# Calculate Speed Distribution
# 1. GMM Model
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])/0.02

# 2. Weibull
y_weibul = sp.stats.weibull_min.pdf(x, *weibull_params)

# 3. Plot Comparison
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm*len(df.speed),'-', color='black', label='GMM')
plot(x, y_weibul*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)

# 4. R square for GMM, Weibull
print(R_square_for_speed(df['speed'], f, weibull_params, f_em))
Speed Distribution Comparison
(0.99812788733352131, 0.9977932464295628, 0.99769357117477808)
Wall time: 18.7 s
In [75]:
%%time
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# 5.2. CDF Comaprison
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='P', legend=True, figsize=(4,3))

plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'}, figsize=(4,3))
align_figures()

cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
print(cdf_diff.max(), cdf_diff_weibull.max()) 
print(x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: RuntimeWarning: divide by zero encountered in log
0.012627160527 0.00752659657105
6.5 5.5
Wall time: 43.8 s
In [76]:
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])*len(df['dir']) 

df['dir'].hist(bins=DIR_BIN, alpha=0.5, label='Data')
plot(x/pi*180, y,'-', color='black', label='GMM')
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print('Direction Distribution Comparison')
Direction Distribution Comparison
In [77]:
%%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in paper 
# Comparison of bivariate distribution constructionapproaches for analysing wind speed anddirection data
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.914404537059
Wall time: 12.6 s

6.3 Sectoral Comaprison

In [78]:
# %%time
# curve_collection=Parallel(n_jobs=-1)(delayed(direction_compare2)
#                                      (gmm, df, angle, incre, complex=True) for angle in arange(start, end, incre))  
In [79]:
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    start, end = -original_incre/2 + incre/2, 360
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = np.radians([angle, incre])  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # 0. Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. R square for GMM, Weibull
        bins = arange(0, sub_df.speed.max()+1)
        density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
        density_expected_gmm_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_gmm_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        # 4. K-S for GMM, Weibull
        cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
                
        # 5. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 5.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
#         plt_configure(xlabel = "$V$", ylabel='Frequency', legend=True)
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 5.2. CDF Comaprison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
#         plt_configure(xlabel = "$V$", ylabel='$P$', legend=True)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        # 5.3. Weibull Comparison
#         ax3 = fig.add_subplot(1,3,3)
#         plot(log(x), log(-log(1-y_ecdf)),'o', alpha=0.8, label='Data')
#         plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label='GMM')
#         plot(log(x), log(-log(1-y_cdf_weibull)),'--',color='black',label='Weibull')
#         plt.gca().set_xlim(right = log(max_speed+1))
#         plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})
        
        curves = {'direction': angle, 'datasize': data_size, 'weight': direction_prob, 'x': x, 
                  'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf,
                  'max_cdf_diff_gmm': cdf_diff.max(), 'max_cdf_diff_weibull': cdf_diff_weibull.max(), 
                  'r_square_gmm': R_square_gmm, 'r_square_weibull': R_square_weibull}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', cdf_diff.max(), cdf_diff_weibull.max(), 
              'speed value:', x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()], 'y gmm', y_cdf_gmm[cdf_diff.argmax()])
        print(' ')
    return curve_collection
In [80]:
%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 1360 weight 0.031971413794724714
GMM Weibull
R square 0.929669076052 0.942528986989
max diff: 0.0564587253859 0.0367562636013 speed value: 5.27578947368 5.27578947368 y gmm 0.380723431268
 
25.0 (15.0 - 35.0) degree
data size: 1459 weight 0.034298744651840705
GMM Weibull
R square 0.911744894359 0.98052300526
max diff: 0.0951557319072 0.0206310529974 speed value: 4.70947368421 7.06421052632 y gmm 0.361776705177
 
45.0 (35.0 - 55.0) degree
data size: 968 weight 0.02275612393624524
GMM Weibull
R square 0.962873537347 0.975418370795
max diff: 0.0380079347408 0.0152491277742 speed value: 6.25894736842 8.34526315789 y gmm 0.522942478482
 
65.0 (55.0 - 75.0) degree
data size: 1515 weight 0.03561521463162349
GMM Weibull
R square 0.957121134652 0.960211832936
max diff: 0.0400628922484 0.0142755086137 speed value: 6.55052631579 8.42210526316 y gmm 0.50977209125
 
85.0 (75.0 - 95.0) degree
data size: 2375 weight 0.05583243217828765
GMM Weibull
R square 0.99023010745 0.990982292735
max diff: 0.0215058947364 0.0216839458001 speed value: 10.4589473684 2.32421052632 y gmm 0.771757263158
 
105.0 (95.0 - 115.0) degree
data size: 2461 weight 0.05785415393295407
GMM Weibull
R square 0.952273916781 0.91812126947
max diff: 0.0410177776448 0.0727045893793 speed value: 13.4378947368 4.13473684211 y gmm 0.869994006183
 
125.0 (115.0 - 135.0) degree
data size: 3093 weight 0.0727114579905026
GMM Weibull
R square 0.971263049423 0.965368247489
max diff: 0.0359191065921 0.0529765154913 speed value: 5.40631578947 5.40631578947 y gmm 0.236631814843
 
145.0 (135.0 - 155.0) degree
data size: 3837 weight 0.09020170200761672
GMM Weibull
R square 0.956312317482 0.983959047806
max diff: 0.0497194949638 0.0249104999968 speed value: 7.65789473684 6.38157894737 y gmm 0.404020406001
 
165.0 (155.0 - 175.0) degree
data size: 2938 weight 0.06906765715360384
GMM Weibull
R square 0.968841409031 0.975485899038
max diff: 0.0434291941931 0.0415824606332 speed value: 12.7705263158 5.67578947368 y gmm 0.815042536603
 
185.0 (175.0 - 195.0) degree
data size: 1989 weight 0.0467581926747849
GMM Weibull
R square 0.97472682006 0.984833440394
max diff: 0.0354314210077 0.0253900883697 speed value: 13.3852631579 8.51789473684 y gmm 0.90069034509
 
205.0 (195.0 - 215.0) degree
data size: 1964 weight 0.04617048286238187
GMM Weibull
R square 0.968608843828 0.953533497438
max diff: 0.0338271126725 0.0261538309823 speed value: 14.3631578947 8.61789473684 y gmm 0.965599006766
 
225.0 (215.0 - 235.0) degree
data size: 2377 weight 0.05587944896327989
GMM Weibull
R square 0.96812173913 0.961711305227
max diff: 0.0346329107292 0.0355164457242 speed value: 3.21157894737 2.14105263158 y gmm 0.22157239007
 
245.0 (235.0 - 255.0) degree
data size: 2107 weight 0.04953218298932719
GMM Weibull
R square 0.912646151879 0.968995090742
max diff: 0.0767837166206 0.0344624117092 speed value: 3.14526315789 2.09684210526 y gmm 0.202286050821
 
265.0 (255.0 - 275.0) degree
data size: 1866 weight 0.043866660397762
GMM Weibull
R square 0.943688039067 0.957687722945
max diff: 0.0637214977343 0.022468373359 speed value: 4.25894736842 4.25894736842 y gmm 0.276578609447
 
285.0 (275.0 - 295.0) degree
data size: 2856 weight 0.06713996896892191
GMM Weibull
R square 0.949387537048 0.9885071704
max diff: 0.0663677375222 0.04554959208 speed value: 5.03684210526 6.71578947368 y gmm 0.27960303164
 
305.0 (295.0 - 315.0) degree
data size: 3687 weight 0.08667544313319855
GMM Weibull
R square 0.984446647404 0.97037401798
max diff: 0.0269664701781 0.0296074461367 speed value: 8.31473684211 4.15736842105 y gmm 0.606068517617
 
325.0 (315.0 - 335.0) degree
data size: 3588 weight 0.08434811227608256
GMM Weibull
R square 0.937711110072 0.953623837361
max diff: 0.0798175949114 0.0423332638695 speed value: 5.84736842105 3.50842105263 y gmm 0.375868023818
 
345.0 (335.0 - 355.0) degree
data size: 2100 weight 0.04936762424185434
GMM Weibull
R square 0.935397144132 0.981172514503
max diff: 0.0784448097449 0.0372926103787 speed value: 7.34526315789 4.89684210526 y gmm 0.541778143078
 
Wall time: 1min 12s
In [81]:
diff_df = pd.DataFrame(curve_collection) 

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.9564666353366787 0.9680542907149142
In [82]:
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.max_cdf_diff_gmm, diff_df.max_cdf_diff_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.05017869321192174 0.03523264623750921
In [83]:
# Compare direction weight with previous figure
display(dir_fig)

6.4 Insufficient-fit Sector Investigation

6.4.1 Data Variability, by Bootstrap (Resampling)

In [84]:
angle =  max_diff_angle = diff_df.ix[diff_df['max_cdf_diff_gmm'].idxmax()]['direction']
incre = rebinned_angle
In [85]:
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [86]:
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weilbull 
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1: 
#         plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
#         plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
25.0 (15.0 - 35.0) Degree Speed Distribution
0.110360750797 5.5 0.445521820023

6.4.2 Time Variability

In [87]:
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(20000000, 20150000, 50000):
    end_time = start_time + 50000 
    time_label = start_time//10000
    df_other_years = df_all_years.query('(date >= @start_time) & (date < @end_time)')
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = time_label)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = time_label)
        
        title = '%s - %s' %(time_label, time_label+4)
        count, division = np.histogram(df_other_years_at_angle['speed'], normed=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=time_label, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = time_label*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if time_label == 2010 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if time_label == 2010 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
# plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
# plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)", legend={'loc':'best'})
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:25: RuntimeWarning: divide by zero encountered in log
25.0 (15.0 - 35.0) Degree Speed Distribution

6.4.3 Adjacent Sector Variability

In [88]:
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
In [89]:
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    curves = curve_df.query('direction == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['datasize'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
25.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [90]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

print(bandwidth, FIT_METHOD)
1.1 square_error

7.1 Variability of the Result

In [91]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(10))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.664 -2.405 2.406 6.042 5.711 -0.138
2 0.184 4.467 -6.417 4.361 4.356 0.259
3 0.094 -1.530 -1.672 1.987 2.458 0.344
4 0.058 3.354 -3.427 1.662 1.777 -0.243
GMM Plot Result
0.664197406814 [[-2.40471154  2.40616194]] [ 5.42410765  6.30055145] -123.871232434
0.183969869346 [[ 4.46739952 -6.41672469]] [ 3.75236336  4.89008271] -45.1403718634
0.0937047310336 [[-1.53038233 -1.6720305 ]] [ 1.73653066  2.64058026] 150.975062225
0.058127992806 [[ 3.35414561 -3.42735773]] [ 1.48879747  1.92493353] -142.683177267
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.022 1.413388e-08 0.020 0.132
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.019 1.349934e-08 0.020 0.129

weight mean_x mean_y sig_x sig_y corr
1 0.662 -2.489 2.460 5.991 5.611 -0.142
2 0.191 4.356 -6.519 4.443 4.297 0.223
3 0.085 -1.507 -1.628 1.941 2.471 0.343
4 0.061 3.438 -3.382 1.719 1.775 -0.272
GMM Plot Result
0.662220198033 [[-2.4891342   2.46013765]] [ 5.33072094  6.24156611] -122.656541245
0.191403198416 [[ 4.35578806 -6.51896989]] [ 3.84635684  4.8385529 ] -49.2689490267
0.0854616168586 [[-1.50659664 -1.62751897]] [ 1.70807298  2.63730105] 152.683003206
0.060914986692 [[ 3.43780127 -3.38210043]] [ 1.48943738  1.97186576] -138.340220491
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.022 1.422655e-08 0.020 0.132
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.020 1.329289e-08 0.019 0.128

weight mean_x mean_y sig_x sig_y corr
1 0.661 -2.471 2.444 6.002 5.721 -0.141
2 0.186 4.564 -6.670 4.234 4.346 0.290
3 0.086 -1.614 -1.749 1.911 2.566 0.402
4 0.067 3.287 -3.343 1.795 1.754 -0.238
GMM Plot Result
0.660898890535 [[-2.47082518  2.44390283]] [ 5.41089603  6.28341755] -125.596235584
0.186061426636 [[ 4.56420657 -6.6701166 ]] [ 3.61168067  4.87588677] 137.559932465
0.0860622215261 [[-1.61379398 -1.74920428]] [ 1.63208317  2.75216209] 153.334304375
0.0669774613029 [[ 3.28664124 -3.34251129]] [ 1.54829079  1.97513909] -132.27166842
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.022 1.328117e-08 0.019 0.128
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.010 0.021 1.425199e-08 0.020 0.132

weight mean_x mean_y sig_x sig_y corr
1 0.651 -2.584 2.485 5.976 5.691 -0.129
2 0.189 4.673 -6.517 4.299 4.521 0.280
3 0.098 -1.488 -1.670 1.980 2.630 0.333
4 0.062 3.456 -3.324 1.721 1.825 -0.213
GMM Plot Result
0.65074944803 [[-2.5839793   2.48490674]] [ 5.41711809  6.2245044 ] -124.644780728
0.189428760704 [[ 4.67257172 -6.51722497]] [ 3.73383338  4.99840283] 140.099015275
0.0976919682232 [[-1.48844643 -1.66980656]] [ 1.76786081  2.77670995] 155.381117358
0.0621298230424 [[ 3.45586822 -3.32377058]] [ 1.56629086  1.960005  ] -142.699025622
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.007 0.022 1.444031e-08 0.020 0.133
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.020 1.330701e-08 0.019 0.128

weight mean_x mean_y sig_x sig_y corr
1 0.665 -2.399 2.420 5.912 5.637 -0.146
2 0.195 4.493 -6.229 4.552 4.406 0.250
3 0.080 -1.520 -1.740 1.943 2.392 0.399
4 0.060 3.389 -3.361 1.769 1.743 -0.260
GMM Plot Result
0.664754888944 [[-2.39875017  2.41999755]] [ 5.31509918  6.20257111] -125.945829966
0.19504666241 [[ 4.49296848 -6.22890011]] [ 3.87490388  5.01129795] -48.7197907962
0.080386180969 [[-1.51962731 -1.73990259]] [ 1.62935245  2.61591969] 148.829491395
0.0598122676769 [[ 3.38889417 -3.36119354]] [ 1.51045638  1.97181676] -133.360909023
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.007 0.021 1.492764e-08 0.021 0.135
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.020 1.358163e-08 0.020 0.129

weight mean_x mean_y sig_x sig_y corr
1 0.702 -2.210 2.069 6.081 5.841 -0.195
2 0.159 4.624 -6.690 4.153 4.248 0.286
3 0.080 -1.567 -1.672 1.931 2.365 0.402
4 0.060 3.240 -3.286 1.751 1.795 -0.313
GMM Plot Result
0.701968071974 [[-2.21002248  2.06864429]] [ 5.33529015  6.52899819] -129.159366728
0.158550928449 [[ 4.62351232 -6.69045631]] [ 3.54815867  4.76470237] 137.26994541
0.0799267089642 [[-1.56658288 -1.67170537]] [ 1.61296801  2.5925831 ] 148.470988337
0.0595542906134 [[ 3.23959629 -3.28621106]] [ 1.46817791  2.03265407] -137.290770497
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.020 1.368062e-08 0.019 0.130
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.021 1.350041e-08 0.020 0.129

weight mean_x mean_y sig_x sig_y corr
1 0.652 -2.552 2.477 5.921 5.653 -0.150
2 0.190 4.767 -6.453 4.360 4.469 0.298
3 0.094 -1.530 -1.502 2.014 2.636 0.408
4 0.064 3.320 -3.395 1.774 1.799 -0.237
GMM Plot Result
0.652165088983 [[-2.55158698  2.47685549]] [ 5.31555613  6.22619156] -126.420057211
0.189547955595 [[ 4.76670569 -6.45277965]] [ 3.69517755  5.03258631] 137.38112946
0.0939250395805 [[-1.52964146 -1.50172469]] [ 1.70274024  2.8470188 ] 151.860755853
0.0643619158413 [[ 3.32041024 -3.39463042]] [ 1.55974605  1.98741771] -136.72813993
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.991 0.007 0.021 1.243702e-08 0.019 0.123
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.021 1.339595e-08 0.019 0.128

weight mean_x mean_y sig_x sig_y corr
1 0.642 -2.593 2.670 5.962 5.594 -0.120
2 0.200 4.641 -6.351 4.377 4.341 0.273
3 0.097 -1.456 -1.590 1.998 2.631 0.326
4 0.061 3.342 -3.286 1.690 1.743 -0.205
GMM Plot Result
0.642399371131 [[-2.59312784  2.67032242]] [ 5.3739117   6.16120889] -121.02620815
0.19982528426 [[ 4.64127794 -6.35129323]] [ 3.71674668  4.91793196] -45.8684443611
0.0971733147889 [[-1.45577143 -1.58976241]] [ 1.78947311  2.77766527] 155.249844989
0.0606020298203 [[ 3.34243959 -3.28552838]] [ 1.52838332  1.88660729] -139.274825152
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.991 0.006 0.019 1.246788e-08 0.019 0.124
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.021 1.344603e-08 0.019 0.128

weight mean_x mean_y sig_x sig_y corr
1 0.713 -2.230 2.130 6.071 5.770 -0.185
2 0.148 4.702 -6.858 4.062 4.214 0.326
3 0.082 -1.500 -1.769 1.917 2.390 0.375
4 0.057 3.409 -3.466 1.712 1.806 -0.259
GMM Plot Result
0.712841034143 [[-2.22984815  2.12955912]] [ 5.32395261  6.46551714] -127.333037457
0.148032848821 [[ 4.70183731 -6.85767405]] [ 3.3939095   4.76839375] 138.211740559
0.0820209801214 [[-1.49979598 -1.76934868]] [ 1.64212332  2.58688322] 150.345654474
0.0571051369148 [[ 3.40916858 -3.4659042 ]] [ 1.50959989  1.97856919] -140.840622784
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.026 1.384623e-08 0.020 0.130
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.011 0.023 1.391491e-08 0.020 0.131

weight mean_x mean_y sig_x sig_y corr
1 0.624 -2.829 2.715 5.878 5.667 -0.117
2 0.212 4.650 -6.032 4.298 4.673 0.273
3 0.103 -1.583 -1.572 2.062 2.754 0.370
4 0.061 3.317 -3.330 1.715 1.913 -0.264
GMM Plot Result
0.623989625919 [[-2.82902581  2.71529475]] [ 5.40740094  6.11772078] -126.351279807
0.211909518681 [[ 4.6499868  -6.03222919]] [ 3.79797368  5.08858665] 143.529643797
0.103045358919 [[-1.58320463 -1.57176571]] [ 1.79946883  2.93228989] 154.228205133
0.0610554964806 [[ 3.31687794 -3.32990586]] [ 1.53751468  2.05870886] -146.259908775
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.006 0.024 1.420108e-08 0.021 0.132
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.021 1.340386e-08 0.019 0.128
Wall time: 39.3 s

7.2 Cross-validation, to select the number of Gaussian

In [92]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
Number of train/test dataset
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
 31903.5 10634.5
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.054581 0.028436 9.176948e-08 0.050671 0.335399 0.932428
1 0.052638 0.028021 9.213233e-08 0.050516 0.336012 0.932319
2 0.056251 0.028755 9.461957e-08 0.051894 0.340534 0.931103
3 0.058749 0.029890 9.796197e-08 0.053278 0.346432 0.928508
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.062770 0.029829 1.039218e-07 0.054092 0.356729 0.925401
1 0.069918 0.036070 1.057397e-07 0.056000 0.359992 0.923520
2 0.059081 0.027761 9.190692e-08 0.050470 0.335571 0.931696
3 0.052641 0.030148 8.990286e-08 0.048631 0.332078 0.933688
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.035281 0.010389 3.697402e-08 0.032402 0.212887 0.972856
1 0.036193 0.009424 3.778744e-08 0.032692 0.215175 0.972319
2 0.037051 0.010557 3.880115e-08 0.033097 0.218070 0.971745
3 0.034488 0.009294 3.546394e-08 0.031611 0.208460 0.973974
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.038938 0.013916 4.239408e-08 0.034171 0.227861 0.969284
1 0.041200 0.018340 4.082532e-08 0.033951 0.223734 0.970246
2 0.045695 0.011882 3.767310e-08 0.032706 0.214842 0.972037
3 0.039297 0.022557 4.753914e-08 0.036848 0.241412 0.965524
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.031407 0.009394 2.000891e-08 0.023561 0.156606 0.985329
1 0.036617 0.010541 3.426084e-08 0.031214 0.204855 0.974993
2 0.029884 0.010197 1.876752e-08 0.023099 0.151701 0.986228
3 0.033035 0.009820 1.902707e-08 0.023265 0.152679 0.986071
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.034246 0.009089 2.095682e-08 0.025000 0.160211 0.984747
1 0.051519 0.014084 3.912575e-08 0.032830 0.219135 0.971121
2 0.037701 0.020830 2.679278e-08 0.027122 0.181041 0.980573
3 0.034077 0.011679 2.244061e-08 0.024909 0.165905 0.983575
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.022085 0.007100 1.364429e-08 0.019723 0.129287 0.990003
1 0.020534 0.006916 1.284977e-08 0.019009 0.125476 0.990594
2 0.020232 0.006660 1.254671e-08 0.018966 0.124007 0.990802
3 0.023724 0.013706 1.379933e-08 0.019587 0.130070 0.989911
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.029835 0.010243 1.643230e-08 0.021269 0.141982 0.988019
1 0.027846 0.010336 1.865345e-08 0.022809 0.151238 0.986362
2 0.026374 0.012339 1.830440e-08 0.022283 0.149745 0.986667
3 0.027176 0.016048 1.914767e-08 0.023870 0.153087 0.985953
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.015271 0.008121 9.084473e-09 0.015865 0.105510 0.993346
1 0.019261 0.014634 9.800222e-09 0.016589 0.109598 0.992777
2 0.015991 0.007760 1.015675e-08 0.016922 0.111552 0.992608
3 0.024514 0.009358 1.371533e-08 0.019924 0.129657 0.989968
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.028593 0.015367 1.636776e-08 0.021963 0.141641 0.988061
1 0.026873 0.017678 1.823408e-08 0.022149 0.149455 0.986954
2 0.022259 0.008107 1.232078e-08 0.018742 0.122924 0.990831
3 0.028112 0.015665 2.179410e-08 0.023899 0.163385 0.984035
Wall time: 1min 14s
In [93]:
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.055555 0.028776 9.412084e-08 0.051589 0.339594 0.931089
2 0.035753 0.009916 3.725664e-08 0.032451 0.213648 0.972724
3 0.032735 0.009988 2.301609e-08 0.025285 0.166460 0.983155
4 0.021644 0.008595 1.321003e-08 0.019321 0.127210 0.990327
5 0.018759 0.009968 1.068919e-08 0.017325 0.114079 0.992174
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.061103 0.030952 9.786781e-08 0.052299 0.346092 0.928576
2 0.041282 0.016674 4.210791e-08 0.034419 0.226962 0.969272
3 0.039386 0.013920 2.732899e-08 0.027465 0.181573 0.980004
4 0.027808 0.012242 1.813446e-08 0.022558 0.149013 0.986750
5 0.026459 0.014204 1.717918e-08 0.021688 0.144351 0.987470
In [94]:
prop_cycle=mpl.rcParams['axes.color_cycle']
gaussian_number_range = train_scores_mean.index
for column, column_name in zip(['R_square','K_S','Chi_square'],["$\ R^2$", "K-S", "$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
In [95]:
fig = plt.figure(figsize=(4.2,2.4))
ax1 = fig.add_subplot(1,2,1) 
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2) 
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax2.grid(False)
ax2.get_yaxis().set_visible(False)
In [ ]:
for fig in [fig_hist, fig_kde, fig_em, fig_gmm]:
    display(fig)
for fig in [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull, 
            fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull,]:
    display(fig)
In [ ]:
import time
save_notebook()
time.sleep(3)
location_name = get_location_name(file_path)
print(location_name)
current_file = 'GMM.ipynb'
output_file = './output_HTML/'+location_name+'.html' 

output_HTML(current_file, output_file)